# Display the title image for the notebook.
# NOTE(review): `Image` is imported only at L18 of the import block below;
# this works in the original notebook (cell order) but would NameError if
# this export were run top-to-bottom as a plain script — confirm intent.
Image(filename='singapore.jpg', width=800, height=500)
from __future__ import print_function

# Standard library
import colorsys
import datetime as dt
import pickle
import time
import warnings
from collections import OrderedDict

# Third-party: scientific stack and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
import seaborn as sns

# Third-party: geospatial
import folium
from folium import plugins
import shapefile
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

# Third-party: notebook display
from IPython.display import Image, HTML

# Third-party: machine learning
import tensorflow as tf
from tensorflow.contrib import learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import make_pipeline
# Global notebook configuration: silence warnings, set plot style,
# and enable inline matplotlib rendering.
# Disable warnings emitted by warnings.warn calls from different packages
warnings.filterwarnings('ignore')
# Set style for plots (seaborn)
sns.set_context('notebook', font_scale=1.25)
sns.set_style('darkgrid')
# Matplotlib inline plots (IPython magic -- only valid inside a notebook)
%matplotlib inline
# Load the three CSV exports, drop incomplete rows, and merge them into
# a single DataFrame `singapore` indexed by the date of sale.

# First part: drop postal district 01 here because singapore.csv (third
# part) also contains it -- keeping both would duplicate those rows.
s01 = pd.read_csv('singapore-part-01.csv', skiprows=1)
s01 = s01.dropna()
s01 = s01[s01['Postal District'] != 1.0]
(s01['Postal District'] == 1.0).sum()  # sanity check: should display 0

# Second part
s02 = pd.read_csv('singapore-part-02.csv', skiprows=1)
s02 = s02.dropna()

# Third part
s03 = pd.read_csv('singapore.csv', skiprows=1)
s03 = s03.dropna()

# Concatenate all three parts into one frame with a fresh integer index,
# then re-index it by the parsed sale date.
sing = pd.concat([s01, s02, s03], ignore_index=True)
singapore = sing.set_index(pd.to_datetime(sing['Date of Sale'].values, infer_datetime_format=True))

# Drop columns that are not used anywhere in the analysis
del singapore['S/N']
del singapore['Date of Sale']
del singapore['Nett Price ($)']
singapore.head(10)
Projects in this database are grouped by Core Central Region (CCR), Rest of Central Region (RCR) and Outside Central Region (OCR). Price ($) refers to the purchase price stated in the Option and S&P agreement, with any upfront direct discounts (e.g. early-bird discount) already deducted.
# Exploratory look at the merged data: extreme records, summary statistics,
# and the distributions of area and unit price.
# (Bare expressions rely on notebook cell display for their output.)
# Map of Singapore postal districts
Image(filename='singapore-district-map.png', width=600, height=500)
# Largest real estate by area sold
singapore[singapore['Area (Sqm)']==singapore['Area (Sqm)'].max()]
# Most expensive square meter of real estate in Singapore
singapore[singapore['Unit Price ($psm)']==singapore['Unit Price ($psm)'].max()]
# Most expensive real estate sold in the last year
singapore[singapore['Price ($)']==singapore['Price ($)'].max()]
# Real estate size statistics
singapore['Area (Sqm)'].describe()
# Price per square-meter statistics
singapore['Unit Price ($psm)'].describe()
# Distribution of real estate area in square meters and prices per square meter
fig, ax = plt.subplots(2, 1, figsize=(7,6))
sns.distplot(singapore['Area (Sqm)'], kde=False, ax=ax[0])
# Mark the mean area on the histogram with a vertical line
axmin, axmax = ax[0].get_ylim()
ax[0].vlines(singapore['Area (Sqm)'].mean(), axmin, axmax, colors='navy', linestyles='-', label='mean value')
sns.distplot(singapore['Unit Price ($psm)'], kde=False, ax=ax[1])
ax[0].legend(loc='best')
plt.tight_layout()
plt.show()
# Changes in unit price and area during year (Jul 2015 - Jul 2016):
# monthly ('1M') resampled median/mean of area and unit price, plotted
# on two stacked axes that share the time axis.
median_area = singapore['Area (Sqm)'].resample('1M').median()
mean_area = singapore['Area (Sqm)'].resample('1M').mean()
median_prices = singapore['Unit Price ($psm)'].resample('1M').median()
mean_prices = singapore['Unit Price ($psm)'].resample('1M').mean()
fig, ax = plt.subplots(2, 1, sharex=True,figsize=(8,6))
median_area.plot(ax=ax[0], label='median values')
mean_area.plot(ax=ax[0], label='mean values')
ax[0].set_ylabel('Area (Sqm)')
ax[0].legend(loc='best')
median_prices.plot(ax=ax[1], label='median values')
mean_prices.plot(ax=ax[1], label='mean values')
ax[1].set_ylabel('Unit Price ($psm)')
ax[1].legend(loc='best')
plt.tight_layout()
plt.show()
# Month-by-month distributions for 2016 (Jan..Jul).
# The original repeated the month-grouping code verbatim for the two plots
# and hard-coded range(1, 8) separately from `month_names`; both are now
# derived from a single helper.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul']

def _monthly_frame(series, names):
    """Return a DataFrame with one column per month name in *names*,
    holding the 2016 values of *series* for that calendar month
    (names[0] -> January, names[1] -> February, ...)."""
    groups = series.ix['2016'].groupby(lambda ts: ts.month)
    columns = OrderedDict()
    for name, month in zip(names, range(1, len(names) + 1)):
        columns[name] = pd.Series(groups.get_group(month).values)
    return pd.DataFrame(columns)

# Seaborn boxplot: area distribution per month
data_pd = _monthly_frame(singapore['Area (Sqm)'], month_names)
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(data=data_pd, orient='v', palette='Set2', ax=ax)
ax.set_xlabel('2016')
ax.set_ylabel('Area (Sqm)')
plt.tight_layout()
plt.show()

# Seaborn violinplot: unit-price distribution per month
data_pd = _monthly_frame(singapore['Unit Price ($psm)'], month_names)
fig, ax = plt.subplots(figsize=(8,6))
sns.violinplot(data=data_pd, orient='v', palette='Set2', ax=ax)
ax.set_xlabel('2016')
ax.set_ylabel('Unit Price ($psm)')
plt.tight_layout()
plt.show()
# Average unit prices and average square area for each street address
# (groupby sorts by street name; reset_index keeps 'Street Name' as a column).
streets = singapore.groupby('Street Name')[['Area (Sqm)', 'Unit Price ($psm)']].mean().reset_index()
streets.describe()
# Geocode every street address via Nominatim, building lon/lat lists that
# stay index-aligned with `streets`.
# Fixes: the original `except GeocoderTimedOut` / `except ServiceTimedOut`
# clauses referenced names that were never imported (NameError if the
# exception fired; `ServiceTimedOut` does not exist in geopy at all), and
# those branches appended nothing, desynchronizing lon/lat from `streets`.
geolocator = Nominatim()
lon = []
lat = []
for street_name in streets['Street Name']:
    try:
        location = geolocator.geocode(street_name + ' SINGAPORE')
        lat.append(location.latitude)
        lon.append(location.longitude)
        time.sleep(0.5)  # throttle requests (Nominatim usage policy)
    except AttributeError:
        # geocode() returned None -> no match for this address
        print('Address not found: {:s}'.format(street_name))
        lat.append(np.NaN)
        lon.append(np.NaN)
    except (GeocoderTimedOut, GeocoderServiceError) as err:
        # Service-side failure: log it and keep the lists aligned
        print(err)
        lat.append(np.NaN)
        lon.append(np.NaN)
    except Exception as err:
        print(err)
        lat.append(np.NaN)
        lon.append(np.NaN)
# Persist the geocoded coordinates, reload them, and build a color scale
# keyed on each street's price relative to the maximum price.
# Pickle geolocations
lon_lat = {'lon':lon, 'lat':lat}
with open('geolocations-all.pkl', 'wb') as file_name:
    pickle.dump(lon_lat, file_name)
# Read pickled geolocations
with open('geolocations-all.pkl', 'rb') as file_name:
    lon_lat = pickle.load(file_name)
lon = lon_lat['lon']
lat = lon_lat['lat']
# Average values for all streets
prices = streets['Unit Price ($psm)']
price_min = prices.min()
price_max = prices.max()
areas = streets['Area (Sqm)']
# Normalized prices in (0, 1]; used both as heat-map weights and hue values
weights = prices/price_max
# Only bin_edges is used below; `hist` itself is discarded
hist, bin_edges = np.histogram(weights)
sns.distplot(prices, bins=10, kde=False)
# Define colors for the colorbar: one hex color per bin edge
# (len(bin_edges) == len(bins) + 1, so colors[0] pairs with the lowest edge)
colors = []
for w in bin_edges:
    # Map the normalized price to an HSV hue, then to a '#rrggbb' string
    c = '#%02x%02x%02x' % tuple(map(lambda x: int(x*255), colorsys.hsv_to_rgb(w, 1, 0.5)))
    colors.append(c)
sns.palplot(sns.color_palette(colors))
# Load the MRT (metro) station shapefile; each record carries the station
# name and a point in Singapore's SVY21 projected coordinates.
# Shapefiles of Metro stations
myshp = open('TrainStation_Nov2015/TrainStations.shp', 'rb')
mydbf = open('TrainStation_Nov2015/TrainStations.dbf', 'rb')
sf = shapefile.Reader(shp=myshp, dbf=mydbf)
records = sf.shapeRecords()
# Singapore GPS system: local module converting SVY21 <-> WGS84.
# NOTE(review): computeLatLon appears to return (lat, lon) -- see its use
# with great_circle further below -- confirm against the svy21 module.
from svy21 import SVY21
svy = SVY21()
# Folium map: metro stations plus one marker per street, colored by the
# street's average unit price.
mapa = folium.Map(location=[1.33, 103.84], zoom_start=12)
# Metro stations. computeLatLon returns (lat, lon) -- evidenced by the
# Bartley lookup below where its result feeds great_circle as (lat, lon);
# the original bound the pair to swapped names (lon_m, lat_m), which only
# worked because the swapped names were also passed in swapped order.
for rec in records:
    lat_m, lon_m = svy.computeLatLon(rec.shape.points[0][1], rec.shape.points[0][0])
    name = rec.record[0]
    folium.RegularPolygonMarker(location=[lat_m, lon_m], radius=4,
                                popup='{:s}'.format(name),
                                fill_color='black', number_of_sides=4).add_to(mapa)
# Real estates with prices
for x, y, p, a, w in zip(lon, lat, prices, areas, weights):
    # Bin the weight with searchsorted. The original strict comparison
    # (bin_edges[k-1] < w < bin_edges[k]) skipped weights lying exactly on
    # a bin edge -- including the minimum and maximum -- leaving `c` stale
    # from the previous iteration (or unbound on the first one).
    k = int(np.searchsorted(bin_edges, w, side='left'))
    k = min(max(k, 1), len(bin_edges) - 1)
    c = colors[k]
    if not np.isnan(x) and not np.isnan(y):
        folium.RegularPolygonMarker(location=[y, x], radius=8,
                                    popup='Area: {:g} (Sqm) \nUnit price {:g} ($psm)'.format(a, p),
                                    fill_color=c, number_of_sides=12).add_to(mapa)
# Colormap legend
color_map = folium.colormap.LinearColormap(colors, vmin=price_min, vmax=price_max, caption='Unit Price ($psm)')
mapa.add_children(color_map)
mapa
# Number of real estates sold at each address
# (groupby order matches `streets`, so lon/lat align by position).
num_sold = singapore.groupby('Street Name')['Project Name'].count()
num_mean = num_sold.mean()
num_sold.describe()
# Folium map for the number of sales at each address
mapa = folium.Map(location=[1.33, 103.84], zoom_start=12)
for x, y, num in zip(lon, lat, num_sold):
    if not np.isnan(x) and not np.isnan(y):
        # Scale the marker radius by sales count relative to the mean;
        # busy addresses (>= 100 sales) get a smaller multiplier so they
        # do not dominate the map.
        if num < 100:
            rad = 50*num/num_mean
        else:
            rad = 25*num/num_mean
        folium.CircleMarker(location=[y, x], radius=rad, popup='Units sold: {:d}'.format(num),
                            color='#3186cc', fill_color='#3186cc').add_to(mapa)
mapa
# Heat map of transaction activity: one (lat, lon, weight) triple per
# street with valid coordinates, weighted by normalized unit price.
heat_map_data = [[x, y, z]
                 for x, y, z in zip(lat, lon, weights)
                 if not np.isnan(x) and not np.isnan(y)]
# Add small amount of randomness to the lat/lon data points for better
# visualization; the third column (the weight) is masked out of the jitter.
jitter = np.random.normal(loc=0, scale=0.00001, size=(len(heat_map_data), 3))
jitter = jitter * np.array([[1, 1, 0]])
heat_map = (jitter + np.array(heat_map_data)).tolist()
# Map of real estate transactions activity
mapa = folium.Map(location=[1.33, 103.84], zoom_start=12)
mapa.add_children(plugins.HeatMap(heat_map, radius=15))
mapa
# The most expensive real estate projects in Singapore (per square meter)
ss = singapore.groupby(['Project Name', 'Street Name', 'Area (Sqm)',
                        'Price ($)'])['Unit Price ($psm)'].max().reset_index()
top = ss.sort_values('Unit Price ($psm)', ascending=False)
top.head(10)
Image(filename='livingroom.jpg', width=600, height=400)
# Most expensive real estates in Singapore by total price paid ($);
# keep only the ten priciest rows for geocoding below.
ss = singapore.groupby(['Project Name', 'Street Name', 'Area (Sqm)',
                        'Unit Price ($psm)'])['Price ($)'].max().reset_index()
top_dollars = ss.sort_values('Price ($)', ascending=False).iloc[:10]
top_dollars
# Geocode the ten most expensive addresses (top_dollars already holds
# exactly ten rows, so the original's `.iloc[:10]` was redundant).
# Fixes: the original `except GeocoderTimedOut` / `except ServiceTimedOut`
# clauses referenced names that were never imported (NameError if fired;
# `ServiceTimedOut` does not exist in geopy), and those branches appended
# nothing, which would desync lon_top/lat_top from top_dollars and break
# the column assignments below.
geolocator = Nominatim()
lon_top = []
lat_top = []
for street_name in top_dollars['Street Name']:
    try:
        location = geolocator.geocode(street_name + ' SINGAPORE')
        lat_top.append(location.latitude)
        lon_top.append(location.longitude)
        time.sleep(1)  # throttle requests (Nominatim usage policy)
    except AttributeError:
        # geocode() returned None -> no match for this address
        print('Address not found: {:s}'.format(street_name))
        lat_top.append(np.NaN)
        lon_top.append(np.NaN)
    except (GeocoderTimedOut, GeocoderServiceError) as err:
        # Service-side failure: log it and keep the lists aligned
        print(err)
        lat_top.append(np.NaN)
        lon_top.append(np.NaN)
    except Exception as err:
        print(err)
        lat_top.append(np.NaN)
        lon_top.append(np.NaN)
print(top_dollars['Street Name'].unique())
# Attach the coordinates (list lengths match top_dollars by construction)
top_dollars['lon'] = lon_top
top_dollars['lat'] = lat_top
top_address = top_dollars.groupby('Street Name')['Price ($)'].max().reset_index()
top_address
# Folium map (top ten real estate addresses).
# Fixes: the original had no NaN guard, so an address whose geocoding
# failed would hand NaN coordinates to folium (every other marker loop in
# this file guards against that); the unused price variable is dropped.
mapa = folium.Map(location=[1.28, 103.85], zoom_start=13)
for x, y, a in zip(lon_top, lat_top, top_dollars['Street Name'].iloc[:10]):
    if not np.isnan(x) and not np.isnan(y):
        folium.RegularPolygonMarker(location=[y, x], radius=8,
                                    popup='{:s}'.format(a),
                                    fill_color='red', number_of_sides=12).add_to(mapa)
mapa
# Geocode every July-2016 sale address and persist the results.
# Fixes: same undefined-exception-name / missing-placeholder bug as the
# other geocoding loops (NameError if a geopy error fired; lists would
# desync from the July rows).
geolocator = Nominatim()
lon = []
lat = []
for street_name in singapore['Street Name'].ix['2016-07']:
    try:
        location = geolocator.geocode(street_name + ' SINGAPORE')
        lat.append(location.latitude)
        lon.append(location.longitude)
        time.sleep(0.5)  # throttle requests (Nominatim usage policy)
    except AttributeError:
        # geocode() returned None -> no match for this address
        print('Address not found: {:s}'.format(street_name))
        lat.append(np.NaN)
        lon.append(np.NaN)
    except (GeocoderTimedOut, GeocoderServiceError) as err:
        # Service-side failure: log it and keep the lists aligned
        print(err)
        lat.append(np.NaN)
        lon.append(np.NaN)
    except Exception as err:
        print(err)
        lat.append(np.NaN)
        lon.append(np.NaN)
# Pickle geolocations
lon_lat = {'lon':lon, 'lat':lat}
with open('geolocations-july.pkl', 'wb') as file_name:
    pickle.dump(lon_lat, file_name)
# Read pickled geolocations (note: unpickling creates fresh float objects,
# so NaN entries are no longer the np.NaN singleton)
with open('geolocations-july.pkl', 'rb') as file_name:
    lon_lat = pickle.load(file_name)
lon = lon_lat['lon']
lat = lon_lat['lat']
# Values for July-2016: per-sale prices/areas and a price-derived color
# scale, mirroring the all-streets setup above.
prices = singapore['Unit Price ($psm)'].ix['2016-07']
price_min = singapore['Unit Price ($psm)'].ix['2016-07'].min()
price_max = singapore['Unit Price ($psm)'].ix['2016-07'].max()
areas = singapore['Area (Sqm)'].ix['2016-07']
# Normalized prices in (0, 1]
weights = prices/price_max
# Only bin_edges is used below; `hist` itself is discarded
hist, bin_edges = np.histogram(weights)
# Define colors for the colorbar: one hex color per bin edge
colors = []
for w in bin_edges:
    # Map the normalized price to an HSV hue, then to a '#rrggbb' string
    c = '#%02x%02x%02x' % tuple(map(lambda x: int(x*255), colorsys.hsv_to_rgb(w, 1, 0.5)))
    colors.append(c)
# Folium map (prices for July-2016).
mapa = folium.Map(location=[1.33, 103.84], zoom_start=12)
# Metro stations. computeLatLon returns (lat, lon) -- evidenced by the
# Bartley lookup below feeding great_circle -- so bind the names in that
# order (the original used swapped names that worked only by accident).
for rec in records:
    lat_m, lon_m = svy.computeLatLon(rec.shape.points[0][1], rec.shape.points[0][0])
    name = rec.record[0]
    folium.RegularPolygonMarker(location=[lat_m, lon_m], radius=4,
                                popup='{:s}'.format(name),
                                fill_color='black', number_of_sides=4).add_to(mapa)
# Real estates with prices
for x, y, p, a, w in zip(lon, lat, prices, areas, weights):
    # Bin the weight inclusively (the original strict comparison skipped
    # weights exactly on an edge, leaving `c` stale or unbound).
    k = int(np.searchsorted(bin_edges, w, side='left'))
    k = min(max(k, 1), len(bin_edges) - 1)
    c = colors[k]
    # Fix: the original tested `x is not np.NaN`, an identity check that
    # fails after the pickle round-trip above (unpickled NaNs are fresh
    # float objects, not the np.NaN singleton), so NaN coordinates leaked
    # into the map. np.isnan tests the value, not the identity.
    if not np.isnan(x) and not np.isnan(y):
        folium.RegularPolygonMarker(location=[y, x], radius=8,
                                    popup='Area: {:g} (Sqm) \nUnit price {:g} ($psm)'.format(a, p),
                                    fill_color=c, number_of_sides=12).add_to(mapa)
# Colormap legend
color_map_july = folium.colormap.LinearColormap(colors, vmin=price_min, vmax=price_max,
                                                caption='Unit Price ($psm)')
mapa.add_children(color_map_july)
mapa
# Create new pandas DataFrame for July-2016: one row per sale, combining
# price, area, floor, address, project and the geocoded coordinates
# (lists align positionally with the July rows).
july = pd.DataFrame({'unit_price':prices, 'area':areas}, index=prices.index)
july['floor'] = singapore['Floor Level'].ix['2016-07']
july['address'] = singapore['Street Name'].ix['2016-07']
july['project'] = singapore['Project Name'].ix['2016-07']
july['lon'] = lon
july['lat'] = lat
july.head()
# Real estates within certain distance from the select metro station:
# locate the station in the shapefile records, then compute the great-
# circle distance from it to every July-2016 sale; sales farther than
# distance_max get NaN and are dropped.
distance_max = 1100. # meters
station_name = 'Bartley'
distance = []
for rec in records:
    if rec.record[0].title() == station_name:
        print('Found: {:s}'.format(station_name))
        lat_m, lon_m = svy.computeLatLon(rec.shape.points[0][1], rec.shape.points[0][0])
        metro = (lat_m, lon_m)
# NOTE(review): if the station name is not found, `metro` is never bound
# and the loop below raises NameError -- confirm the name exists upstream.
for lon_h, lat_h in zip(july['lon'], july['lat']):
    house = (lat_h, lon_h)
    dist = great_circle(metro, house).meters
    if dist <= distance_max:
        distance.append(dist)
    else:
        distance.append(np.NaN)
july['distance'] = distance
# dropna() removes rows with NaN in ANY column (failed geocodes too),
# not only the too-far ones.
july_station = july.dropna()
july_station
# Distribution of unit prices of real estates within the
# select distance from the metro station under consideration
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(6,6))
sns.distplot(july_station['unit_price'], ax=ax[0])
sns.violinplot(july_station['unit_price'], ax=ax[1])
ax[0].set_xlabel('')
ax[1].set_xlabel('Unit Price ($psm)')
# Average area and average unit price per project for July 2016
projects = july_station.groupby('project')[['area', 'unit_price', 'lat', 'lon']].mean().reset_index()
projects
# Folium map with average prices of real estates within the select
# distance from the metro station: a marker plus a circle of radius
# distance_max around the station, and one marker per nearby project.
mapa = folium.Map(location=[1.35, 103.87], zoom_start=14)
folium.Marker(location=metro, popup=station_name, icon=folium.Icon(icon='info-sign')).add_to(mapa)
folium.CircleMarker(location=metro, radius=distance_max, popup='{:s}'.format(station_name),
                    color='#ffffcc', fill_color='#ffffcc').add_to(mapa)
# projects['lat']/projects['lon'] feed the marker location as [lat, lon]
project_rows = zip(projects['lat'], projects['lon'], projects['project'], projects['unit_price'])
for lat_p, lon_p, proj_name, unit_price in project_rows:
    marker = folium.RegularPolygonMarker(location=[lat_p, lon_p], radius=8,
                                         popup='{:s} Unit price: {:g} ($psm)'.format(proj_name, unit_price),
                                         fill_color='red', number_of_sides=12)
    marker.add_to(mapa)
mapa
# Real estates with unit prices below the mean price and with the square area of over 80 Sqm
july_station[(july_station['unit_price'] < july_station['unit_price'].mean()) & (july_station['area'] > 80.)]
# Case study: "THE MINTON" project -- summary statistics, unit-price
# distribution, and monthly min/mean/max price evolution with sales counts.
Image(filename='minton_sg.jpg', width=600, height=400)
# The Minton real estate sales within the last year
minton = singapore[singapore['Project Name']=='THE MINTON']
minton[['Price ($)', 'Area (Sqm)', 'Unit Price ($psm)']].describe()
# Distribution of unit prices for The Minton
sns.distplot(minton['Unit Price ($psm)'])
# Time evolution of the Unit Price ($psm) for The Minton residences
# (monthly resample; note `minton` is rebound to the mean-resampled frame)
minton = singapore[singapore['Project Name']=='THE MINTON'].resample('1M').mean()
minton_min = singapore[singapore['Project Name']=='THE MINTON'].resample('1M').min()
minton_max = singapore[singapore['Project Name']=='THE MINTON'].resample('1M').max()
minton_count = singapore[singapore['Project Name']=='THE MINTON'].resample('1M').count()
# Two stacked panels: prices on top (3x height), sales counts below
fig = plt.figure(1, figsize=(7,6))
grid = gs.GridSpec(2, 1, height_ratios=[3,1])
ax1 = plt.subplot(grid[0,0])
ax2 = plt.subplot(grid[1,0])
minton['Unit Price ($psm)'].plot(ls='-', label='mean value', ax=ax1)
minton_min['Unit Price ($psm)'].plot(ls='--', c='orange', label='min value', ax=ax1)
minton_max['Unit Price ($psm)'].plot(ls='-.', c='orange', label='max value', ax=ax1)
ax1.set_title('Monthly Unit Price ($psm) change')
ax1.set_ylabel('Unit Price ($psm)')
ax1.legend(loc='lower left')
ax2.bar(minton_count.index, minton_count['No. of Units'], width=10)
ax2.set_ylabel('Units sold')
ax2.set_xticklabels('')
plt.tight_layout()
plt.show()
# Sales in the month where the price curve is of interest
singapore[singapore['Project Name']=='THE MINTON'].ix['2016-02']
# Real estate deals in "The Minton" residences
# Real estates above the fifth floor with unit prices below the cheapest 5% of the neighborhood
# Unit prices of the neighborhood are for the July 2016
minton = singapore[singapore['Project Name']=='THE MINTON']
minton[(minton['Unit Price ($psm)'] < july_station['unit_price'].quantile(0.05)) &
       (minton['Floor Level'].values != '01 to 05')]
# Label-encode the categorical columns and assemble the feature matrix
# `sing` and target `y`.
# The original repeated the same fit/transform boilerplate five times
# (with a redundant manual sort -- LabelEncoder sorts its classes itself,
# so e.g. fitting on ['OCR', 'RCR', 'CCR'] yields classes
# ['CCR', 'OCR', 'RCR'] exactly as fitting on the sorted uniques does).

def _fit_encoder(values):
    """Fit a LabelEncoder on the unique values of *values* and return
    (encoder, integer codes for every entry of *values*)."""
    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(values.unique()))
    return encoder, encoder.transform(values)

# Encode: Market Segment (CCR / OCR / RCR)
le_ms, ms_transf = _fit_encoder(singapore['Market Segment'])
# Encode: Floor Level
le_level, floor_transf = _fit_encoder(singapore['Floor Level'])
# Encode: Project name
le_project, project_transf = _fit_encoder(singapore['Project Name'])
# Encode: Street name
le_street, street_transf = _fit_encoder(singapore['Street Name'])
# Encode: Type of Sale
le_sale, sale_transf = _fit_encoder(singapore['Type of Sale'])

# Assemble features into a DataFrame
sing = singapore[['Postal District', 'Area (Sqm)']].copy()
sing['Market Segment'] = ms_transf
sing['Floor Level'] = floor_transf
sing['Project Name'] = project_transf
sing['Street Name'] = street_transf
sing['Type of Sale'] = sale_transf
sing.head()

# Target: unit price per square meter
y = singapore['Unit Price ($psm)'].copy()
y.head()
# Predictivity: Pearson correlation of each feature with the others.
pearson = sing.corr('pearson')
# NOTE(review): `.ix[-1]` appears to fall back to positional indexing here
# (the index labels are column-name strings), i.e. the last row of the
# correlation matrix with its last entry dropped -- confirm on the pandas
# version in use; .ix is long deprecated.
pearson.ix[-1][:-1].sort_values()
# Correlation matrix as heatmap (seaborn)
fig, ax = plt.subplots(figsize=(7,6))
sns.heatmap(pearson, annot=True, annot_kws=dict(size=12), vmin=-1, vmax=1, ax=ax)
plt.tight_layout()
plt.show()
# Split data into train and test leaving last ten (10) rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(sing.iloc[:-10], y.iloc[:-10], test_size=0.2)
print('Database size: {:g} \nTrain set size: {:g} \nTest set size: {:g} \nValidate set size: {:g}'
      .format(len(singapore), len(y_train), len(y_test), len(singapore)-(len(y_train)+len(y_test))))
# Scale data (fit the scaler on the training split only, then reuse it)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train & evaluate model performance
def train_and_evaluate(model, X, y):
    """Fit *model* on (X, y) and report its performance.

    Prints the training-set score and the 5-fold cross-validation score
    (mean +/- two standard deviations). Returns the array of per-fold
    cross-validation scores so callers can reuse them (the original
    returned None, which every caller ignored -- backward compatible).
    """
    model.fit(X, y)
    print('Score on training set: {:g}'.format(model.score(X, y)))
    # k-fold cross validation iterator of 5 folds
    cv = KFold(X.shape[0], 5, shuffle=True)
    scores = cross_val_score(model, X, y, cv=cv)
    print('Score using 5-fold cross-validation: {:g} +/- {:g}'.format(np.mean(scores), np.std(scores)*2))
    return scores
# Baseline regressors: OLS linear regression, its residual plot, and a
# degree-3 polynomial + Ridge pipeline. Each is trained, cross-validated,
# and scored on the held-out test split.
# Ordinary least squares Linear Regression
clf_lr = linear_model.LinearRegression()
train_and_evaluate(clf_lr, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = clf_lr.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Residual plot (prediction minus truth; the zero line marks perfect fit)
fig, ax = plt.subplots(figsize=(7,5))
ax.set_title('Ordinary least squares (residuals)')
ax.scatter(y_test, y_pred-y_test, color='royalblue', s=30)
ax.set_ylabel('Residuals')
ax.hlines(0, y_test.min(), y_test.max())
plt.tight_layout()
plt.show()
# Polynomial features (degree 3) followed by Ridge regression
model = make_pipeline(preprocessing.PolynomialFeatures(degree=3), linear_model.Ridge())
train_and_evaluate(model, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = model.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Support Vector Regression with a grid search over C, kernel and epsilon
# (5-fold CV, all cores).
svr = SVR()
params = {'C':[0.01, 0.1, 1., 10., 100., 1000.],
          'kernel':['linear', 'poly', 'rbf'],
          'epsilon':[0.1, 1.]}
# Grid search with cross-validation for optimal parameters
mach = GridSearchCV(estimator=svr, param_grid=params, cv=5, n_jobs=-1)
mach.fit(X_train, y_train.ravel())
# Test model prediction on test data
y_pred = mach.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Tree-based regressors: a single decision tree, a random forest, and
# extra-trees, each trained/cross-validated and scored on the test split;
# extra-trees predictions are also plotted against the true values.
# DecisionTreeRegressor
tree = DecisionTreeRegressor()
train_and_evaluate(tree, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = tree.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# RandomForestRegressor
forrest = RandomForestRegressor()
train_and_evaluate(forrest, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = forrest.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Random Forests (Extra Trees)
clf_et = ExtraTreesRegressor()
train_and_evaluate(clf_et, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = clf_et.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Predicted vs. true values; the red diagonal marks perfect prediction
fig, ax = plt.subplots(figsize=(6,6))
ax.set_title('Random Forests (Extra Trees)')
ax.scatter(y_test, y_pred, s=30, color='royalblue')
ax.plot(y_test, y_test, c='red', ls='-', lw=2)
ax.set_xlabel('True values')
ax.set_ylabel('Predicted values')
plt.tight_layout()
plt.show()
# Boosted regressors (AdaBoost over decision trees, gradient boosting with
# least-absolute-deviation loss) plus the GBR feature-importance chart.
# AdaBoost Regressor with DecisionTree
clf_ad = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10), n_estimators=100)
train_and_evaluate(clf_ad, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = clf_ad.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Gradient Boosting Regressor ('lad' = least absolute deviation loss)
clf_gb = GradientBoostingRegressor(loss='lad', max_depth=10, n_estimators=100)
train_and_evaluate(clf_gb, X_train, y_train.ravel())
# Test model prediction on test data
y_pred = clf_gb.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Feature importance, scaled so the strongest feature reads 100%
feature_importance = clf_gb.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig, ax = plt.subplots(figsize=(7,4))
ax.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, sing.columns[sorted_idx])
ax.set_xlabel('Feature Relative Importance')
plt.tight_layout()
plt.show()
# TensorFlow Regression using DNN (legacy tf.contrib.learn API),
# then a correlation heatmap of all model predictions vs. the truth.
features = learn.infer_real_valued_columns_from_input(X_train)
# Optimizer algorithm
adam = tf.train.AdamOptimizer(learning_rate=0.001)
# Build a multi-layer DNN for regression (3 hidden layers, ReLU)
model_nn = learn.DNNRegressor(feature_columns=features, hidden_units=[400, 200, 100],
                              optimizer=adam, activation_fn=tf.nn.relu,
                              config=learn.estimators.RunConfig(num_cores=8))
# Fit and predict
model_nn.fit(X_train, y_train, steps=15000, batch_size=512)
y_pred = model_nn.predict(X_test)
print('R2 score: {:g}'.format(metrics.r2_score(y_test, y_pred)))
print('Explained variance score: {:g}'.format(metrics.explained_variance_score(y_test, y_pred)))
print('Mean squared error: {:g}'.format(metrics.mean_squared_error(y_test, y_pred)))
# Collect every model's test-set predictions side by side with the truth
y_lr = clf_lr.predict(X_test) # Linear
y_po = model.predict(X_test) # Polynomial
y_vm = mach.predict(X_test) # Support Vector Machine
y_tr = tree.predict(X_test) # DecisionTree
y_rf = forrest.predict(X_test) # Random Forest
y_gb = clf_gb.predict(X_test) # Gradient Boosting
y_ad = clf_ad.predict(X_test) # AdaBoost
y_nn = model_nn.predict(X_test) # DNN
stack = pd.DataFrame(data={'lr':y_lr, 'po':y_po, 'vm':y_vm, 'tr':y_tr, 'rf':y_rf,
                           'gb':y_gb, 'ad':y_ad, 'nn':y_nn, 'true':y_test})
pearson_models = stack.corr('pearson')
# Correlation matrix as heatmap (seaborn)
fig, ax = plt.subplots(figsize=(7,6))
sns.heatmap(pearson_models, annot=True, annot_kws=dict(size=12), vmin=0, vmax=1, ax=ax)
plt.tight_layout()
plt.show()
# Stack the four strongest base models (SVR + Gradient Boosting +
# AdaBoost + DNN) two ways: a linear meta-regressor chosen by
# `stack_type`, and a small DNN blender.
Image(filename='graph.jpg', width=600, height=400)
# Stacking & Ensembling multiple regressors
stack_type = 'TheilSan'
if stack_type == 'OLS':
    # Ordinary least squares Linear Regression (OLS)
    stacker = linear_model.LinearRegression(fit_intercept=False)
elif stack_type == 'Ridge':
    # Linear least squares with L2 regularization (Ridge)
    stacker = linear_model.Ridge(fit_intercept=False)
elif stack_type == 'TheilSan':
    # Theil-Sen Estimator: robust multivariate regression model
    stacker = linear_model.TheilSenRegressor(fit_intercept=False, n_jobs=-1)
# Fit the meta-regressor on the base models' test-set predictions
stacker.fit(X=stack[['vm', 'gb', 'ad', 'nn']], y=stack['true'])
# Stacking multiple regressors using DNN
features_blend = learn.infer_real_valued_columns_from_input(stack[['vm', 'gb', 'ad', 'nn']])
# Optimizer algorithm
adam = tf.train.AdamOptimizer(learning_rate=0.001)
# Build multi-layer DNN for regression (2 small hidden layers)
blend_nn = learn.DNNRegressor(feature_columns=features_blend, hidden_units=[20, 10],
                              optimizer=adam, activation_fn=tf.nn.relu,
                              config=learn.estimators.RunConfig(num_cores=8))
# Fit DNN blender on the same base-model predictions
blend_nn.fit(x=stack[['vm', 'gb', 'ad', 'nn']], y=stack['true'], steps=8000, batch_size=256)
# Final evaluation: predict the unit price of the ten held-out rows with
# every trained model and tabulate predictions plus relative errors (%).
singapore.tail(10)
# Predict price for new apartments (not seen before by any model)
x_new = sing.iloc[-10:] # example
# Scale data with the scaler fitted on the training split
x_new = scaler.transform(x_new)
# True values (actual prices)
true_values = y.iloc[-10:].values
# PREDICT: Ordinary least squares Linear Regression
y_new_lr = clf_lr.predict(x_new)
# PREDICT: Polynomial Regression
y_new_po = model.predict(x_new)
# PREDICT: Support Vector Regression
y_new_vm = mach.predict(x_new)
# PREDICT: DecisionTreeRegressor
y_new_tr = tree.predict(x_new)
# PREDICT: RandomForestRegressor
y_new_rf = forrest.predict(x_new)
# PREDICT: Random Forests (Extra Trees)
y_new_et = clf_et.predict(x_new)
# PREDICT: AdaBoost Regressor
y_new_ad = clf_ad.predict(x_new)
# PREDICT: Gradient Boosting for regression
y_new_gb = clf_gb.predict(x_new)
# PREDICT: TensorFlow DNNRegressor
y_new_nn = np.around(model_nn.predict(x_new), decimals=2)
# Relative errors (%) for predictions (positive = under-prediction)
lr_error = (1.-y_new_lr/true_values)*100.
vm_error = (1.-y_new_vm/true_values)*100.
tr_error = (1.-y_new_tr/true_values)*100.
et_error = (1.-y_new_et/true_values)*100.
rf_error = (1.-y_new_rf/true_values)*100.
po_error = (1.-y_new_po/true_values)*100.
ad_error = (1.-y_new_ad/true_values)*100.
gb_error = (1.-y_new_gb/true_values)*100.
nn_error = (1.-y_new_nn/true_values)*100.
# One column pair (prediction, error) per model
predictions = pd.DataFrame(data={'True Values':y.iloc[-10:].values,
                                 'OLS':y_new_lr, 'OLS Err':lr_error, # OLS - Ordinary Least Squares
                                 'SVR':y_new_vm, 'SVR Err':vm_error, # SVR - Support Vector Regression
                                 'DTR':y_new_tr, 'DTR Err':tr_error, # DTR - Decision Trees regressor
                                 'RFR':y_new_rf, 'RFR Err':rf_error, # RFR - Random Forest regressor
                                 'ETR':y_new_et, 'ETR Err':et_error, # ETR - Extra Trees regressor
                                 'ADA':y_new_ad, 'ADA Err':ad_error, # ADA - AdaBoost regressor
                                 'GBR':y_new_gb, 'GBR Err':gb_error, # GBR - Gradient Boost regressor
                                 'DNN':y_new_nn, 'DNN Err':nn_error}) # DNN - Deep Neural Network
predictions[['True Values', 'OLS Err', 'SVR Err', 'DTR Err',
             'RFR Err', 'ETR Err', 'ADA Err', 'GBR Err', 'DNN Err']].round(decimals=2)
# Root Mean Square Error between the logarithm of predicted and true
# values (i.e. RMSLE) for every base model, then for three combiners:
# a plain average, the linear stacker, and the DNN blender.
algos_predicted = predictions[['ADA', 'DNN', 'DTR', 'ETR', 'GBR', 'OLS', 'RFR', 'SVR']]
for col in algos_predicted:
    rms_log = metrics.mean_squared_error(np.log(predictions['True Values']), np.log(algos_predicted[col]))
    print('{:s}: RMSE = {:g}'.format(col, np.sqrt(rms_log)))
# SVM + Gradient Boosting + AdaBoost + DNN predictions for the ten rows
new_stack = pd.DataFrame(data={'vm':y_new_vm, 'gb':y_new_gb, 'ad':y_new_ad, 'nn':y_new_nn, 'true':true_values})
# PREDICT: Stacked regressors (Stack)
y_new_st = stacker.predict(new_stack[['vm', 'gb', 'ad', 'nn']])
# Relative errors (%) for predictions
st_error = (1.-y_new_st/true_values)*100.
# PREDICT: Stacked regressors using DNN (ensemble blender)
y_new_bl = blend_nn.predict(new_stack[['vm', 'gb', 'ad', 'nn']])
bl_error = (1.-y_new_bl/true_values)*100.
# Averaging best predictions from different regressors
# Extra Trees + Gradient Boosting + AdaBoost
w = [1, 1, 1] # equal weights
y_new_agg = np.c_[y_new_et, y_new_gb, y_new_ad]
y_new_avr = np.average(y_new_agg, axis=1, weights=w)
avr_error = (1.-y_new_avr/true_values)*100.
# OrderedDict keeps the column order as listed
predictions = (('True Values',y.iloc[-10:].values),
               ('Average',y_new_avr), ('Average Err',avr_error),
               ('Stack',y_new_st), ('Stack Err',st_error),
               ('Ensamble',y_new_bl), ('Ensamble Err',bl_error))
preds_avr = pd.DataFrame(data=OrderedDict(predictions))
preds_avr.round(decimals=2)
# Root Mean Square Error (RMSE) between the logarithm of the
# predicted value and the logarithm of the observed sales price
rms_log = metrics.mean_squared_error(np.log(preds_avr['True Values']), np.log(preds_avr['Average']))
print(' Average: RMSE = {:g}'.format(np.sqrt(rms_log)))
rms_log = metrics.mean_squared_error(np.log(preds_avr['True Values']), np.log(preds_avr['Stack']))
print(' Stack: RMSE = {:g}'.format(np.sqrt(rms_log)))
rms_log = metrics.mean_squared_error(np.log(preds_avr['True Values']), np.log(preds_avr['Ensamble']))
print(' Ensamble: RMSE = {:g}'.format(np.sqrt(rms_log)))
Disclaimer: This notebook is furnished "as is". The author does not provide any warranty whatsoever, whether express, implied, or statutory, including, but not limited to, any warranty of merchantability or fitness for a particular purpose or any warranty that the contents of the notebook will be error-free. In no respect shall the author incur any liability for any damages, including, but not limited to, direct, indirect, special, or consequential damages arising out of, resulting from, or in any way connected to the use of the notebook material, whether or not based upon warranty, contract, tort, or otherwise; whether or not injury was sustained by persons or property or otherwise; and whether or not loss was sustained from, or arose out of, the usage of the results of the notebook material.